This notebook demonstrates some of the basic functionality of librosa version 0.4.
Following through this example, you'll learn how to:
In [1]:
from __future__ import print_function
In [ ]:
# We'll need numpy for some mathematical operations
import numpy as np
# Librosa for audio
import librosa
# matplotlib for displaying the output
import matplotlib.pyplot as plt
%matplotlib inline
# And seaborn to make it look nice
import seaborn
seaborn.set(style='ticks')
# and IPython.display for audio output
import IPython.display
In [3]:
audio_path = librosa.util.example_audio_file()
# or uncomment the line below and point it at your favorite song:
#
# audio_path = '/path/to/your/favorite/song.mp3'
y, sr = librosa.load(audio_path)
By default, librosa will resample the signal to 22050Hz.
You can change this behavior by saying:
librosa.load(audio_path, sr=44100)
to resample at 44.1KHz, or
librosa.load(audio_path, sr=None)
to disable resampling.
You might want to stop here to verify that scikits.samplerate is installed and functioning correctly. Without this, audio resampling will fall back on scipy.signal.resample(), which is rather inefficient.
You can test this by printing out the librosa.core.audio._HAS_SAMPLERATE flag:
In [4]:
print('HAS_SAMPLERATE: ', librosa.core.audio._HAS_SAMPLERATE)
This first step will show how to compute a Mel spectrogram from an audio waveform.
In [5]:
# Let's make and display a mel-scaled power (energy-squared) spectrogram
S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
# Convert to log scale (dB). We'll use the peak power as reference.
log_S = librosa.logamplitude(S, ref_power=np.max)
# Make a new figure
plt.figure(figsize=(12,4))
# Display the spectrogram on a mel scale
# sample rate and hop length parameters are used to render the time axis
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
# Put a descriptive title on the plot
plt.title('mel power spectrogram')
# draw a color bar
plt.colorbar(format='%+02.0f dB')
# Make the figure layout compact
plt.tight_layout()
In [6]:
y_harmonic, y_percussive = librosa.effects.hpss(y)
In [7]:
# What do the spectrograms look like?
# Let's make and display a mel-scaled power (energy-squared) spectrogram
S_harmonic = librosa.feature.melspectrogram(y_harmonic, sr=sr)
S_percussive = librosa.feature.melspectrogram(y_percussive, sr=sr)
# Convert to log scale (dB). We'll use the peak power as reference.
log_Sh = librosa.logamplitude(S_harmonic, ref_power=np.max)
log_Sp = librosa.logamplitude(S_percussive, ref_power=np.max)
# Make a new figure
plt.figure(figsize=(12,6))
plt.subplot(2,1,1)
# Display the spectrogram on a mel scale
librosa.display.specshow(log_Sh, sr=sr, y_axis='mel')
# Put a descriptive title on the plot
plt.title('mel power spectrogram (Harmonic)')
# draw a color bar
plt.colorbar(format='%+02.0f dB')
plt.subplot(2,1,2)
librosa.display.specshow(log_Sp, sr=sr, x_axis='time', y_axis='mel')
# Put a descriptive title on the plot
plt.title('mel power spectrogram (Percussive)')
# draw a color bar
plt.colorbar(format='%+02.0f dB')
# Make the figure layout compact
plt.tight_layout()
Next, we'll extract Chroma features to represent pitch class information.
In [8]:
# We'll use a CQT-based chromagram here. An STFT-based implementation also exists in chroma_cqt()
# We'll use the harmonic component to avoid pollution from transients
C = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)
# Make a new figure
plt.figure(figsize=(12,4))
# Display the chromagram: the energy in each chromatic pitch class as a function of time
# To make sure that the colors span the full range of chroma values, set vmin and vmax
librosa.display.specshow(C, sr=sr, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
plt.title('Chromagram')
plt.colorbar()
plt.tight_layout()
Mel-frequency cepstral coefficients are commonly used to represent texture or timbre of sound.
In [9]:
# Next, we'll extract the top 13 Mel-frequency cepstral coefficients (MFCCs)
mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
# Let's pad on the first and second deltas while we're at it
delta_mfcc = librosa.feature.delta(mfcc)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
# How do they look? We'll show each in its own subplot
plt.figure(figsize=(12, 6))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc)
plt.ylabel('MFCC')
plt.colorbar()
plt.subplot(3,1,2)
librosa.display.specshow(delta_mfcc)
plt.ylabel('MFCC-$\Delta$')
plt.colorbar()
plt.subplot(3,1,3)
librosa.display.specshow(delta2_mfcc, sr=sr, x_axis='time')
plt.ylabel('MFCC-$\Delta^2$')
plt.colorbar()
plt.tight_layout()
# For future use, we'll stack these together into one matrix
M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])
In [10]:
# Now, let's run the beat tracker.
# We'll use the percussive component for this part
plt.figure(figsize=(12, 6))
tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr)
# Let's re-draw the spectrogram, but this time, overlay the detected beats
plt.figure(figsize=(12,4))
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
# Let's draw transparent lines over the beat frames
plt.vlines(beats, 0, log_S.shape[0], colors='r', linestyles='-', linewidth=2, alpha=0.5)
plt.axis('tight')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
By default, the beat tracker will trim away any leading or trailing beats that don't appear strong enough.
To disable this behavior, call beat_track() with trim=False.
In [11]:
print('Estimated tempo: %.2f BPM' % tempo)
print('First 5 beat frames: ', beats[:5])
# Frame numbers are great and all, but when do those beats occur?
print('First 5 beat times: ', librosa.frames_to_time(beats[:5], sr=sr))
# We could also get frame numbers from times by librosa.time_to_frames()
In [12]:
# feature.sync will summarize each beat event by the mean feature vector within that beat
M_sync = librosa.feature.sync(M, beats)
plt.figure(figsize=(12,6))
# Let's plot the original and beat-synchronous features against each other
plt.subplot(2,1,1)
librosa.display.specshow(M)
plt.title('MFCC-$\Delta$-$\Delta^2$')
# We can also use pyplot *ticks directly
# Let's mark off the raw MFCC and the delta features
plt.yticks(np.arange(0, M.shape[0], 13), ['MFCC', '$\Delta$', '$\Delta^2$'])
plt.colorbar()
plt.subplot(2,1,2)
librosa.display.specshow(M_sync)
# librosa can generate axis ticks from arbitrary timestamps and beat events also
librosa.display.time_ticks(librosa.frames_to_time(beats, sr=sr))
plt.yticks(np.arange(0, M_sync.shape[0], 13), ['MFCC', '$\Delta$', '$\Delta^2$'])
plt.title('Beat-synchronous MFCC-$\Delta$-$\Delta^2$')
plt.colorbar()
plt.tight_layout()
In [13]:
# Beat synchronization is flexible.
# Instead of computing the mean delta-MFCC within each beat, let's do beat-synchronous chroma
# We can replace the mean with any statistical aggregation function, such as min, max, or median.
C_sync = librosa.feature.sync(C, beats, aggregate=np.median)
plt.figure(figsize=(12,6))
plt.subplot(2, 1, 1)
librosa.display.specshow(C, sr=sr, y_axis='chroma', vmin=0.0, vmax=1.0, x_axis='time')
plt.title('Chroma')
plt.colorbar()
plt.subplot(2, 1, 2)
librosa.display.specshow(C_sync, y_axis='chroma', vmin=0.0, vmax=1.0)
beat_times = librosa.frames_to_time(beats, sr=sr)
librosa.display.time_ticks(beat_times)
plt.title('Beat-synchronous Chroma (median aggregation)')
plt.colorbar()
plt.tight_layout()